This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
## Rows: 100000 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): gender, smoking_history
## dbl (7): age, hypertension, heart_disease, bmi, HbA1c_level, blood_glucose_l...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 100,000 × 9
## gender age hypertension heart_disease smoking_history bmi HbA1c_level
## <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 Female 80 0 1 never 25.2 6.6
## 2 Female 54 0 0 No Info 27.3 6.6
## 3 Male 28 0 0 never 27.3 5.7
## 4 Female 36 0 0 current 23.4 5
## 5 Male 76 1 1 current 20.1 4.8
## 6 Female 20 0 0 never 27.3 6.6
## 7 Female 44 0 0 never 19.3 6.5
## 8 Female 79 0 0 No Info 23.9 5.7
## 9 Male 42 0 0 never 33.6 4.8
## 10 Female 32 0 0 never 27.3 5
## # ℹ 99,990 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
## # A tibble: 58,552 × 9
## gender age hypertension heart_disease smoking_history bmi HbA1c_level
## <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 Female 80 0 1 never 25.2 6.6
## 2 Female 54 0 0 No Info 27.3 6.6
## 3 Female 36 0 0 current 23.4 5
## 4 Female 20 0 0 never 27.3 6.6
## 5 Female 44 0 0 never 19.3 6.5
## 6 Female 79 0 0 No Info 23.9 5.7
## 7 Female 32 0 0 never 27.3 5
## 8 Female 53 0 0 never 27.3 6.1
## 9 Female 54 0 0 former 54.7 6
## 10 Female 78 0 0 former 36.0 5
## # ℹ 58,542 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
## # A tibble: 1 × 1
## n
## <int>
## 1 27397
## # A tibble: 1 × 1
## n
## <int>
## 1 18865
## # A tibble: 1,267 × 9
## gender age hypertension heart_disease smoking_history bmi HbA1c_level
## <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 Male 67 0 1 not current 27.3 6.5
## 2 Male 57 1 1 not current 27.8 6.6
## 3 Male 80 0 1 former 24.4 7.5
## 4 Male 75 0 1 not current 28.1 7.5
## 5 Male 69 0 1 former 24.1 6.8
## 6 Female 59 0 1 never 60.3 8.8
## 7 Male 80 0 1 former 33.0 6
## 8 Female 62 1 1 former 44.2 8.2
## 9 Female 62 1 1 never 43.2 8.8
## 10 Female 76 0 1 former 25.7 9
## # ℹ 1,257 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
## # A tibble: 3,942 × 10
## # Groups: bmi >= 30 [2]
## gender age hypertension heart_disease smoking_history bmi HbA1c_level
## <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 Female 80 0 1 never 25.2 6.6
## 2 Male 76 1 1 current 20.1 4.8
## 3 Female 72 0 1 former 27.9 6.5
## 4 Male 67 0 1 not current 27.3 6.5
## 5 Female 77 1 1 never 32.0 5
## 6 Female 59 0 1 ever 23.1 6.5
## 7 Male 68 1 1 current 27.3 5
## 8 Male 59 0 1 ever 30.8 5
## 9 Female 80 0 1 never 29.6 5.8
## 10 Male 57 1 1 not current 27.8 6.6
## # ℹ 3,932 more rows
## # ℹ 3 more variables: blood_glucose_level <dbl>, diabetes <dbl>,
## # `bmi >= 30` <lgl>
## # A tibble: 3,942 × 10
## # Groups: bmi >= 30 [2]
## gender age hypertension heart_disease smoking_history bmi HbA1c_level
## <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 Female 80 0 1 never 25.2 6.6
## 2 Male 76 1 1 current 20.1 4.8
## 3 Female 72 0 1 former 27.9 6.5
## 4 Male 67 0 1 not current 27.3 6.5
## 5 Female 77 1 1 never 32.0 5
## 6 Female 59 0 1 ever 23.1 6.5
## 7 Male 68 1 1 current 27.3 5
## 8 Male 59 0 1 ever 30.8 5
## 9 Female 80 0 1 never 29.6 5.8
## 10 Male 57 1 1 not current 27.8 6.6
## # ℹ 3,932 more rows
## # ℹ 3 more variables: blood_glucose_level <dbl>, diabetes <dbl>,
## # `bmi >= 30` <lgl>
## # A tibble: 1,903 × 9
## gender age hypertension heart_disease smoking_history bmi HbA1c_level
## <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 Male 50 0 0 former 37.2 9
## 2 Male 53 0 0 current 30.8 6.6
## 3 Male 76 0 0 never 31.9 7.5
## 4 Male 63 1 0 ever 35.1 5.8
## 5 Male 48 1 0 current 36.1 6.8
## 6 Male 37 0 0 never 37.2 7
## 7 Male 36 0 0 not current 46.1 6.2
## 8 Male 50 0 0 never 31.8 7.5
## 9 Male 43 0 0 never 69.4 7.5
## 10 Male 43 1 0 not current 40.9 6.6
## # ℹ 1,893 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
## # A tibble: 1 × 1
## n
## <int>
## 1 1903
## # A tibble: 2,330 × 9
## gender age hypertension heart_disease smoking_history bmi HbA1c_level
## <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 Female 67 0 0 never 63.5 8.8
## 2 Female 36 0 0 current 32.3 6.2
## 3 Female 77 0 0 never 31.7 6.5
## 4 Female 47 0 0 never 36.5 7.5
## 5 Female 61 0 0 not current 39.4 9
## 6 Female 80 0 0 former 36.2 6.5
## 7 Female 52 1 0 never 50.3 6.6
## 8 Female 68 0 0 No Info 40.3 7.5
## 9 Female 70 0 0 not current 33.2 7.5
## 10 Female 67 0 0 former 32.3 7
## # ℹ 2,320 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
## # A tibble: 1 × 1
## n
## <int>
## 1 2330
## # A tibble: 21 × 9
## gender age hypertension heart_disease smoking_history bmi HbA1c_level
## <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 Male 42 0 0 current 11.9 6
## 2 Male 6 0 0 never 15.7 6.1
## 3 Male 71 1 0 former 13.2 6.6
## 4 Male 14 0 0 never 19.0 6.6
## 5 Male 54 0 0 never 18.9 6
## 6 Male 61 1 0 never 18.4 6.5
## 7 Male 4 0 0 never 18.7 6
## 8 Male 51 0 0 current 17.8 6.2
## 9 Male 80 1 0 current 19.0 6.6
## 10 Male 6 0 0 No Info 15.6 9
## # ℹ 11 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
## # A tibble: 1 × 1
## n
## <int>
## 1 21
## # A tibble: 57 × 9
## gender age hypertension heart_disease smoking_history bmi HbA1c_level
## <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 Female 79 0 0 not current 18.1 7
## 2 Female 4 0 0 No Info 15.0 6.5
## 3 Female 51 0 0 current 17.4 7
## 4 Female 9 0 0 never 16 6.1
## 5 Female 60 0 0 No Info 17.9 8.2
## 6 Female 13 0 0 No Info 17.3 6.2
## 7 Female 80 0 0 never 17.4 6.5
## 8 Female 8 0 0 No Info 14.3 7.5
## 9 Female 80 0 0 never 17.8 6.2
## 10 Female 78 1 0 not current 17.7 8.8
## # ℹ 47 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
## # A tibble: 1 × 1
## n
## <int>
## 1 57
## # A tibble: 7,445 × 9
## gender age hypertension heart_disease smoking_history bmi HbA1c_level
## <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 Male 42 0 0 never 33.6 4.8
## 2 Male 15 0 0 never 30.4 6.1
## 3 Male 40 0 0 current 36.4 6
## 4 Male 30 0 0 never 33.8 6.1
## 5 Male 34 0 0 never 31.2 5.8
## 6 Male 54 0 0 never 31.9 6.6
## 7 Male 79 0 0 former 31.2 5.8
## 8 Male 54 0 0 former 32.8 5
## 9 Male 38 0 0 never 55.6 6.5
## 10 Male 58 0 0 former 36.5 5.8
## # ℹ 7,435 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
## # A tibble: 1 × 1
## n
## <int>
## 1 7445
## # A tibble: 11,852 × 9
## gender age hypertension heart_disease smoking_history bmi HbA1c_level
## <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 Female 54 0 0 former 54.7 6
## 2 Female 78 0 0 former 36.0 5
## 3 Female 53 0 0 No Info 31.8 4
## 4 Female 34 0 0 never 56.4 6.2
## 5 Female 77 1 1 never 32.0 5
## 6 Female 27 0 0 not current 30.2 5.7
## 7 Female 37 0 0 No Info 30.5 5.7
## 8 Female 56 0 0 never 31.0 6.5
## 9 Female 44 0 0 never 37.4 5.7
## 10 Female 30 0 0 No Info 50.1 6
## # ℹ 11,842 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
## # A tibble: 1 × 1
## n
## <int>
## 1 11852
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the plot.
Note that the echo = FALSE parameter was added to the code
chunk to prevent printing of the R code that generated the plot.
In this section we display a stacked bar chart showing how HbA1c levels (average blood sugar levels) classify males and females into the categories Normal, Prediabetes, and Diabetes. From this plot we gain insight into how the distribution of Normal, Prediabetes, and Diabetes varies between the two genders.
First we load our libraries: dplyr for data manipulation, ggplot2 for data visualization, and plotly to make our plot interactive. Then we display our original dataset.
From our original dataset we create a new dataset called HbA1c_by_gender. In the new dataset we keep only the Male and Female genders and exclude Other by filtering the gender column to values not equal to ‘Other’. We also create a new variable called HbA1c_category with the mutate function, using the case_when function to classify the HbA1c_level column into the categories ‘Normal’, ‘Prediabetes’, and ‘Diabetes’.
Then we print out our mutated dataset to make a comparison of the original.
In the next section we want our stacked bar to follow a certain order: Normal at the top, Prediabetes in the middle, and Diabetes at the bottom. To do this we mutate our HbA1c_category column into an ordered categorical variable with the factor function; its levels argument sets the order we want.
Next, we will plot a stacked bar chart using ggplot. Before that, we rename our dataset to ‘Interactive_mode’ so that we can easily pass it to ggplotly. The scale_fill_manual function is used to manually assign colors to the different categories.
Lastly, we use plotly to turn our plot into an interactive plot. When hovering over the plot we can see the count, gender, and HbA1c_category for each of the stacked bars.
## Warning: package 'plotly' was built under R version 4.4.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
## # A tibble: 100,000 × 9
## gender age hypertension heart_disease smoking_history bmi HbA1c_level
## <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 Female 80 0 1 never 25.2 6.6
## 2 Female 54 0 0 No Info 27.3 6.6
## 3 Male 28 0 0 never 27.3 5.7
## 4 Female 36 0 0 current 23.4 5
## 5 Male 76 1 1 current 20.1 4.8
## 6 Female 20 0 0 never 27.3 6.6
## 7 Female 44 0 0 never 19.3 6.5
## 8 Female 79 0 0 No Info 23.9 5.7
## 9 Male 42 0 0 never 33.6 4.8
## 10 Female 32 0 0 never 27.3 5
## # ℹ 99,990 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
## # A tibble: 99,982 × 10
## gender age hypertension heart_disease smoking_history bmi HbA1c_level
## <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 Female 80 0 1 never 25.2 6.6
## 2 Female 54 0 0 No Info 27.3 6.6
## 3 Male 28 0 0 never 27.3 5.7
## 4 Female 36 0 0 current 23.4 5
## 5 Male 76 1 1 current 20.1 4.8
## 6 Female 20 0 0 never 27.3 6.6
## 7 Female 44 0 0 never 19.3 6.5
## 8 Female 79 0 0 No Info 23.9 5.7
## 9 Male 42 0 0 never 33.6 4.8
## 10 Female 32 0 0 never 27.3 5
## # ℹ 99,972 more rows
## # ℹ 3 more variables: blood_glucose_level <dbl>, diabetes <dbl>,
## # HbA1c_category <chr>
library(ggplot2)

# Example dataset (replace with actual data).
# The values are simulated, so the RNG seed is fixed: without it the BMI
# values and diabetes labels -- and therefore the rendered plot -- would
# change on every knit.
set.seed(42)
data <- data.frame(
  gender = factor(rep(c("Male", "Female", "Other"), each = 50)),
  bmi = runif(150, min = 18, max = 40),
  diabetes = factor(sample(c("Yes", "No"), 150, replace = TRUE))
)

# Box plot of BMI per gender, split by diabetes status.
ggplot(data, aes(x = gender, y = bmi, fill = diabetes)) +
  geom_boxplot() +
  labs(
    title = "BMI Distribution by Gender and Diabetes Status",
    x = "Gender",
    y = "BMI"
  ) +
  theme_minimal()
# Extra info on the gender column of the original data set.
# Females: 58,552 -- that is 17,122 more females than males in this data set.
diabetes_dataset %>%
  filter(gender == "Female") %>%
  count()
## # A tibble: 1 × 1
## n
## <int>
## 1 58552
# Males: 41,430
diabetes_dataset %>%
  filter(gender == "Male") %>%
  count()
## # A tibble: 1 × 1
## n
## <int>
## 1 41430
# Other: 18
diabetes_dataset %>%
  filter(gender == "Other") %>%
  count()
## # A tibble: 1 × 1
## n
## <int>
## 1 18
# TODO: check why most people don't have 5.3, given that we are working with a large number of people ****
# TODO: turn heart disease into a category
# When we graph the data we see that it may not be reliable, because people who are older than 80 seem to be pushed out
``` r
library(dplyr)
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
# Overlaid age densities of diabetes cases and heart-disease cases.
# The two 0/1 indicator columns are pivoted into one `condition` column so a
# single geom_density() layer can be filled per condition.
diabetes_dataset %>%
  filter(age >= 3) %>% # Keep ages 3 and up
  select(age, diabetes, heart_disease) %>%
  pivot_longer(
    cols = c(diabetes, heart_disease),
    names_to = "condition",
    values_to = "status"
  ) %>%
  filter(status == 1) %>% # Keep only cases where the condition is present
  ggplot(aes(x = age, fill = condition)) +
  geom_density(alpha = 0.6, adjust = 1.5) +
  scale_x_continuous(limits = c(3, 80), breaks = seq(3, 80, by = 7)) +
  # Density is a proportion per year of age (e.g. 0.02), so it must be
  # multiplied by 100 to read as a percentage; `scale = 1` would label
  # 0.02 as "0.02%" instead of "2%".
  scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
  # Colours are bound to the condition keys by name, and `breaks` pins the
  # order the display labels are applied in.
  scale_fill_manual(
    values = c(diabetes = "red", heart_disease = "purple"),
    breaks = c("diabetes", "heart_disease"),
    labels = c("Diabetes", "Heart Disease")
  ) +
  labs(
    title = "Density of Diabetes & Heart Disease Across Age Groups",
    x = "Age",
    y = "Percentage Density",
    fill = "Condition"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))
diabetes_dataset
## # A tibble: 100,000 × 9
## gender age hypertension heart_disease smoking_history bmi HbA1c_level
## <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 Female 80 0 1 never 25.2 6.6
## 2 Female 54 0 0 No Info 27.3 6.6
## 3 Male 28 0 0 never 27.3 5.7
## 4 Female 36 0 0 current 23.4 5
## 5 Male 76 1 1 current 20.1 4.8
## 6 Female 20 0 0 never 27.3 6.6
## 7 Female 44 0 0 never 19.3 6.5
## 8 Female 79 0 0 No Info 23.9 5.7
## 9 Male 42 0 0 never 33.6 4.8
## 10 Female 32 0 0 never 27.3 5
## # ℹ 99,990 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
# Age density of patients aged 3+ who have both diabetes and heart disease.
diabetes_dataset %>%
  select(age, diabetes, heart_disease) %>%
  filter(age >= 3 & diabetes == 1 & heart_disease == 1) %>%
  arrange(age) %>%
  ggplot(aes(x = age)) +
  geom_density(adjust = 1.5, alpha = 0.6) +
  theme_minimal()
diabetes_dataset
## # A tibble: 100,000 × 9
## gender age hypertension heart_disease smoking_history bmi HbA1c_level
## <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 Female 80 0 1 never 25.2 6.6
## 2 Female 54 0 0 No Info 27.3 6.6
## 3 Male 28 0 0 never 27.3 5.7
## 4 Female 36 0 0 current 23.4 5
## 5 Male 76 1 1 current 20.1 4.8
## 6 Female 20 0 0 never 27.3 6.6
## 7 Female 44 0 0 never 19.3 6.5
## 8 Female 79 0 0 No Info 23.9 5.7
## 9 Male 42 0 0 never 33.6 4.8
## 10 Female 32 0 0 never 27.3 5
## # ℹ 99,990 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
# Subsets of patients aged 2+ used for the overlaid age densities.
diabetes_only <- diabetes_dataset %>%
  select(age, diabetes) %>%
  filter(age >= 2, diabetes == 1)
heart_disease_only <- diabetes_dataset %>%
  select(age, heart_disease) %>%
  filter(age >= 2, heart_disease == 1)
# Holds age and BMI for everyone aged 2+ (no condition filter); only `age`
# is plotted below, as the whole-population reference curve.
bmi_older_than_two <- diabetes_dataset %>%
  select(age, bmi) %>%
  filter(age >= 2)

# `fill` is mapped inside aes() to a group label so ggplot draws a legend;
# with constant fills the three curves could not be told apart.
ggplot() +
  geom_density(
    data = diabetes_only,
    aes(x = age, fill = "Diabetes"),
    alpha = 0.5
  ) +
  geom_density(
    data = heart_disease_only,
    aes(x = age, fill = "Heart disease"),
    alpha = 0.5
  ) +
  geom_density(
    data = bmi_older_than_two,
    aes(x = age, fill = "All patients (age 2+)"),
    alpha = 0.5
  ) +
  scale_fill_manual(
    name = "Group",
    values = c(
      "Diabetes" = "blue",
      "Heart disease" = "red",
      "All patients (age 2+)" = "magenta"
    ),
    breaks = c("Diabetes", "Heart disease", "All patients (age 2+)")
  ) +
  labs(
    title = "Age Distribution: Diabetes vs. Heart Disease vs. All Patients",
    x = "Age",
    y = "Density"
  ) +
  theme_minimal()
3666 #10.01 smallest, largest
# Rows (age 2+) whose BMI is missing, youngest first.
# is.na() is required here: comparing a numeric column with the string "NA"
# never matches a missing value.
bmi_older_than_two <- diabetes_dataset %>%
  select(age, bmi) %>%
  filter(age >= 2, is.na(bmi)) %>%
  arrange(age)

# Diabetes cases, oldest first (youngest is 3 years old, oldest is 80).
diabetes_dataset %>%
  select(age, diabetes) %>%
  filter(diabetes == 1) %>%
  arrange(desc(age))

# Heart-disease cases, oldest first (youngest is 2, oldest is 80).
diabetes_dataset %>%
  select(age, heart_disease) %>%
  filter(heart_disease == 1) %>%
  arrange(desc(age))
library(ggplot2)
library(dplyr)

# Label every patient by condition and keep only patients with at least one.
# case_when() returns the first matching branch, so the combined case must
# come first; otherwise patients with both conditions would be labelled
# "Diabetes Only".
diabetes_dataset_condition <- diabetes_dataset %>%
  mutate(condition = case_when(
    diabetes == 1 & heart_disease == 1 ~ "Diabetes & Heart Disease",
    diabetes == 1 ~ "Diabetes Only",
    heart_disease == 1 ~ "Heart Disease Only"
  )) %>%
  filter(!is.na(condition)) # rows matching no branch get NA and are dropped

# Scatter plot of BMI against age, coloured by condition.
ggplot(diabetes_dataset_condition) +
  geom_point(aes(x = age, y = bmi, color = condition), alpha = 0.2) +
  labs(
    title = "BMI vs. Age Across Diabetes & Heart Disease",
    x = "Age",
    y = "BMI",
    color = "Condition"
  ) +
  theme_minimal()
library(ggplot2)
library(dplyr)
# summary(diabetes_dataset$bmi)
# diabetes_dataset$bmi <- as.numeric(diabetes_dataset$bmi)

# Copy of BMI in which values below 2 are replaced by NA.
diabetes_dataset_filtered_bmi <- diabetes_dataset %>%
  mutate(bmi_filtered = ifelse(bmi >= 2, bmi, NA))

# NOTE(review): a patient with both conditions passes both filters below and
# is therefore plotted once under each label, so "Only" is not strict.
diabetes_only <- diabetes_dataset_filtered_bmi %>%
  select(age, bmi_filtered, diabetes) %>%
  filter(age >= 2, diabetes == 1) %>%
  mutate(condition = "Diabetes Only")

heart_disease_only <- diabetes_dataset_filtered_bmi %>%
  select(age, bmi_filtered, heart_disease) %>%
  filter(age >= 2, heart_disease == 1) %>%
  mutate(condition = "Heart Disease Only")

combined_data <- bind_rows(diabetes_only, heart_disease_only)

# BMI vs. age, coloured by condition, with one overall trend line.
ggplot(combined_data) +
  # Only the jittered layer is drawn: adding geom_point() as well would plot
  # every observation twice.
  geom_jitter(
    aes(x = age, y = bmi_filtered, color = condition),
    width = 0.1,
    height = 0.1,
    alpha = 0.3
  ) +
  scale_color_manual(
    values = c(
      "Diabetes Only" = "deeppink",
      "Heart Disease Only" = "darkblue"
    )
  ) +
  # Single trend line across both groups; `linewidth` is the line-thickness
  # argument (using `size` for lines is deprecated since ggplot2 3.4.0).
  geom_smooth(
    aes(x = age, y = bmi_filtered),
    method = "loess",
    linewidth = 0.8,
    color = "red",
    se = FALSE
  ) +
  scale_x_continuous(breaks = seq(0, 80, by = 10)) +
  scale_y_continuous(breaks = seq(10.01, 95.69, by = 10)) +
  labs(
    title = "BMI vs. Age Across Diabetes & Heart Disease",
    x = "Age",
    y = "BMI",
    color = "Condition"
  ) +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))
library(ggplot2)
library(dplyr)

# Copy of BMI in which values below 2 are replaced by NA.
diabetes_dataset_filtered_bmi <- diabetes_dataset %>%
  mutate(bmi_filtered = ifelse(bmi >= 2, bmi, NA))

# NOTE(review): a patient with both conditions passes both filters below and
# is therefore plotted once under each label, so "Only" is not strict.
diabetes_only <- diabetes_dataset_filtered_bmi %>%
  select(age, bmi_filtered, diabetes) %>%
  filter(age >= 2, diabetes == 1) %>%
  mutate(condition = "Diabetes Only")

heart_disease_only <- diabetes_dataset_filtered_bmi %>%
  select(age, bmi_filtered, heart_disease) %>%
  filter(age >= 2, heart_disease == 1) %>%
  mutate(condition = "Heart Disease Only")

combined_data <- bind_rows(diabetes_only, heart_disease_only)

# BMI vs. age, coloured by condition, with one overall trend line.
ggplot(combined_data) +
  # Only the jittered layer is drawn: adding geom_point() as well would plot
  # every observation twice.
  geom_jitter(
    aes(x = age, y = bmi_filtered, color = condition),
    width = 0.1,
    height = 0.1,
    alpha = 0.3
  ) +
  scale_color_manual(
    values = c(
      "Diabetes Only" = "cornflowerblue",
      "Heart Disease Only" = "darkorchid4"
    )
  ) +
  # Single trend line across both groups; `linewidth` is the line-thickness
  # argument (using `size` for lines is deprecated since ggplot2 3.4.0).
  geom_smooth(
    aes(x = age, y = bmi_filtered),
    method = "loess",
    linewidth = 1,
    color = "red",
    se = FALSE
  ) +
  scale_x_continuous(breaks = seq(0, 80, by = 10)) +
  scale_y_continuous(breaks = seq(10.01, 95.69, by = 10)) +
  labs(
    title = "BMI vs. Age Across Diabetes & Heart Disease",
    x = "Age",
    y = "BMI",
    color = "Condition"
  ) +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))
# ggplotly(interactive_combined_data)
####
# Rows (age 2+) whose BMI is missing. is.na() is required: comparing a
# numeric column with a string such as "N/A" never matches a missing value.
bmi_older_than_two <- diabetes_dataset %>%
  select(age, bmi) %>%
  filter(age >= 2, is.na(bmi))
# min bmi 10.01
# max bmi 95.69

# Copy of BMI in which values below 2 are replaced by NA, listed from the
# highest BMI down.
diabetes_dataset_filtered_bmi <- diabetes_dataset %>%
  mutate(bmi_filtered = ifelse(bmi >= 2, bmi, NA))
diabetes_dataset_filtered_bmi %>%
  arrange(desc(bmi_filtered))
```{=html}
<div class="plotly html-widget html-fill-item" id="htmlwidget-a6376b90d3fb62343749" style="width:672px;height:576px;"></div>
<script type="application/json" data-for="htmlwidget-a6376b90d3fb62343749">{"x":{"data":[{"orientation":"v","width":[0.89999999999999991,0.90000000000000013],"base":[0.6158628227899986,0.62930243784697082],"x":[1,2],"y":[0.3841371772100014,0.37069756215302918],"text":["gender: Female<br />n: 0.3841372<br />HbA1c_category: Normal < 5.7%","gender: Male<br />n: 0.3706976<br />HbA1c_category: Normal < 5.7%"],"type":"bar","textposition":"none","marker":{"autocolorscale":false,"color":"rgba(255,248,220,0.6)","line":{"width":0.37795275590551186,"color":"rgba(0,0,0,1)"}},"name":"Normal < 5.7%","legendgroup":"Normal < 5.7%","showlegend":true,"xaxis":"x","yaxis":"y","hoverinfo":"text","frame":null},{"orientation":"v","width":[0.89999999999999991,0.90000000000000013],"base":[0.20212802295395546,0.21624426743905384],"x":[1,2],"y":[0.41373479983604311,0.41305817040791698],"text":["gender: Female<br />n: 0.4137348<br />HbA1c_category: Prediabetes 5.7% - 6.4%","gender: Male<br />n: 0.4130582<br />HbA1c_category: Prediabetes 5.7% - 6.4%"],"type":"bar","textposition":"none","marker":{"autocolorscale":false,"color":"rgba(189,183,107,0.6)","line":{"width":0.37795275590551186,"color":"rgba(0,0,0,1)"}},"name":"Prediabetes 5.7% - 6.4%","legendgroup":"Prediabetes 5.7% - 6.4%","showlegend":true,"xaxis":"x","yaxis":"y","hoverinfo":"text","frame":null},{"orientation":"v","width":[0.89999999999999991,0.90000000000000013],"base":[0,0],"x":[1,2],"y":[0.20212802295395546,0.21624426743905384],"text":["gender: Female<br />n: 0.2021280<br />HbA1c_category: Diabetes ≥ 6.5%","gender: Male<br />n: 0.2162443<br />HbA1c_category: Diabetes ≥ 6.5%"],"type":"bar","textposition":"none","marker":{"autocolorscale":false,"color":"rgba(139,101,8,0.6)","line":{"width":0.37795275590551186,"color":"rgba(0,0,0,1)"}},"name":"Diabetes ≥ 6.5%","legendgroup":"Diabetes ≥ 
6.5%","showlegend":true,"xaxis":"x","yaxis":"y","hoverinfo":"text","frame":null},{"x":[1,2],"y":[0.80793141139499935,0.81465121892348535],"text":["38.4%","37.1%"],"hovertext":["gender: Female<br />n: 22492<br />HbA1c_category: Normal < 5.7%<br />paste0(round(percent, 1), \"%\"): 38.4%","gender: Male<br />n: 15358<br />HbA1c_category: Normal < 5.7%<br />paste0(round(percent, 1), \"%\"): 37.1%"],"textfont":{"size":14.66456692913386,"color":"rgba(0,0,0,1)"},"type":"scatter","mode":"text","hoveron":"points","name":"Normal < 5.7%","legendgroup":"Normal < 5.7%","showlegend":false,"xaxis":"x","yaxis":"y","hoverinfo":"text","frame":null},{"x":[1,2],"y":[0.40899542287197704,0.4227733526430123],"text":["41.4%","41.3%"],"hovertext":["gender: Female<br />n: 24225<br />HbA1c_category: Prediabetes 5.7% - 6.4%<br />paste0(round(percent, 1), \"%\"): 41.4%","gender: Male<br />n: 17113<br />HbA1c_category: Prediabetes 5.7% - 6.4%<br />paste0(round(percent, 1), \"%\"): 41.3%"],"textfont":{"size":14.66456692913386,"color":"rgba(0,0,0,1)"},"type":"scatter","mode":"text","hoveron":"points","name":"Prediabetes 5.7% - 6.4%","legendgroup":"Prediabetes 5.7% - 6.4%","showlegend":false,"xaxis":"x","yaxis":"y","hoverinfo":"text","frame":null},{"x":[1,2],"y":[0.10106401147697773,0.10812213371952692],"text":["20.2%","21.6%"],"hovertext":["gender: Female<br />n: 11835<br />HbA1c_category: Diabetes ≥ 6.5%<br />paste0(round(percent, 1), \"%\"): 20.2%","gender: Male<br />n: 8959<br />HbA1c_category: Diabetes ≥ 6.5%<br />paste0(round(percent, 1), \"%\"): 21.6%"],"textfont":{"size":14.66456692913386,"color":"rgba(0,0,0,1)"},"type":"scatter","mode":"text","hoveron":"points","name":"Diabetes ≥ 6.5%","legendgroup":"Diabetes ≥ 
6.5%","showlegend":false,"xaxis":"x","yaxis":"y","hoverinfo":"text","frame":null}],"layout":{"margin":{"t":42.057838660578383,"r":7.3059360730593621,"b":38.477929984779308,"l":48.949771689497723},"plot_bgcolor":"rgba(255,255,255,1)","paper_bgcolor":"rgba(255,255,255,1)","font":{"color":"rgba(0,0,0,1)","family":"","size":14.611872146118724},"title":{"text":"Male vs. Female Blood Sugar Levels (HbA1c)","font":{"color":"rgba(0,0,0,1)","family":"","size":17.534246575342465},"x":0.5,"xref":"paper"},"xaxis":{"domain":[0,1],"automargin":true,"type":"linear","autorange":false,"range":[0.40000000000000002,2.6000000000000001],"tickmode":"array","ticktext":["Female","Male"],"tickvals":[1,2],"categoryorder":"array","categoryarray":["Female","Male"],"nticks":null,"ticks":"outside","tickcolor":"rgba(51,51,51,1)","ticklen":3.6529680365296811,"tickwidth":0.66417600664176002,"showticklabels":true,"tickfont":{"color":"rgba(77,77,77,1)","family":"","size":11.68949771689498},"tickangle":-0,"showline":true,"linecolor":"rgba(0,0,0,1)","linewidth":0.66417600664176002,"showgrid":false,"gridcolor":null,"gridwidth":0,"zeroline":false,"anchor":"y","title":{"text":"Gender","font":{"color":"rgba(0,0,0,1)","family":"","size":14.611872146118724}},"hoverformat":".2f"},"yaxis":{"domain":[0,1],"automargin":true,"type":"linear","autorange":false,"range":[-0.050000000000000003,1.05],"tickmode":"array","ticktext":["0.00","0.25","0.50","0.75","1.00"],"tickvals":[0,0.25,0.5,0.75,1],"categoryorder":"array","categoryarray":["0.00","0.25","0.50","0.75","1.00"],"nticks":null,"ticks":"outside","tickcolor":"rgba(51,51,51,1)","ticklen":3.6529680365296811,"tickwidth":0.66417600664176002,"showticklabels":true,"tickfont":{"color":"rgba(77,77,77,1)","family":"","size":11.68949771689498},"tickangle":-0,"showline":true,"linecolor":"rgba(0,0,0,1)","linewidth":0.66417600664176002,"showgrid":false,"gridcolor":null,"gridwidth":0,"zeroline":false,"anchor":"x","title":{"text":"Proportion","font":{"color":"rgba(0,0,0,1)","fa
mily":"","size":14.611872146118724}},"hoverformat":".2f"},"shapes":[{"type":"rect","fillcolor":null,"line":{"color":null,"width":0,"linetype":[]},"yref":"paper","xref":"paper","x0":0,"x1":1,"y0":0,"y1":1}],"showlegend":true,"legend":{"bgcolor":"rgba(255,255,255,1)","bordercolor":"transparent","borderwidth":1.8897637795275593,"font":{"color":"rgba(0,0,0,1)","family":"","size":11.68949771689498},"title":{"text":"HbA1c Category","font":{"color":"rgba(0,0,0,1)","family":"","size":14.611872146118724}}},"hovermode":"closest","barmode":"relative"},"config":{"doubleClick":"reset","modeBarButtonsToAdd":["hoverclosest","hovercompare"],"showSendToCloud":false},"source":"A","attrs":{"5ff0750a5ba2":{"x":{},"y":{},"fill":{},"type":"bar"},"5ff02bf4b6c":{"x":{},"y":{},"fill":{},"label":{}}},"cur_data":"5ff0750a5ba2","visdat":{"5ff0750a5ba2":["function (y) ","x"],"5ff02bf4b6c":["function (y) ","x"]},"highlight":{"on":"plotly_click","persistent":false,"dynamic":false,"selectize":false,"opacityDim":0.20000000000000001,"selected":{"opacity":1},"debounce":0},"shinyEvents":["plotly_hover","plotly_click","plotly_selected","plotly_relayout","plotly_brushed","plotly_brushing","plotly_clickannotation","plotly_doubleclick","plotly_deselect","plotly_afterplot","plotly_sunburstclick"],"base_url":"https://plot.ly"},"evals":[],"jsHooks":[]}</script>
diabetes_dataset
# Subsets of patients aged 2+ used for the overlaid age densities.
diabetes_only <- diabetes_dataset %>%
  select(age, diabetes) %>%
  filter(age >= 2, diabetes == 1)
heart_disease_only <- diabetes_dataset %>%
  select(age, heart_disease) %>%
  filter(age >= 2, heart_disease == 1)
# Holds age and BMI for everyone aged 2+ (no condition filter); only `age`
# is plotted below, as the whole-population reference curve.
bmi_older_than_two <- diabetes_dataset %>%
  select(age, bmi) %>%
  filter(age >= 2)

# `fill` is mapped inside aes() to a group label so ggplot draws a legend;
# with constant fills the three curves could not be told apart.
ggplot() +
  geom_density(
    data = diabetes_only,
    aes(x = age, fill = "Diabetes"),
    alpha = 0.5
  ) +
  geom_density(
    data = heart_disease_only,
    aes(x = age, fill = "Heart disease"),
    alpha = 0.5
  ) +
  geom_density(
    data = bmi_older_than_two,
    aes(x = age, fill = "All patients (age 2+)"),
    alpha = 0.5
  ) +
  scale_fill_manual(
    name = "Group",
    values = c(
      "Diabetes" = "blue",
      "Heart disease" = "red",
      "All patients (age 2+)" = "magenta"
    ),
    breaks = c("Diabetes", "Heart disease", "All patients (age 2+)")
  ) +
  labs(
    title = "Age Distribution: Diabetes vs. Heart Disease vs. All Patients",
    x = "Age",
    y = "Density"
  ) +
  theme_minimal()
diabetes_dataset
# Diabetes cases (youngest: 3, oldest: 80). blood_glucose_level is selected
# as well because the density plot below maps it to the x axis.
diabetes_only <- diabetes_dataset %>%
  select(age, diabetes, blood_glucose_level) %>%
  filter(diabetes == 1)
dia_count <- diabetes_only %>% tally()
dia_count

# Everyone aged 2+ with a recorded blood glucose level (youngest: 2,
# oldest: 80). Missing values are detected with is.na(); comparing with the
# string "NA" does not test for missingness.
blood_glucose_dataset <- diabetes_dataset %>%
  select(age, blood_glucose_level) %>%
  filter(!is.na(blood_glucose_level), age >= 2)
bg_count <- blood_glucose_dataset %>% tally()
bg_count

# Heart-disease cases (youngest: 2, oldest: 80); blood_glucose_level is kept
# for the plot below.
heart_disease_only <- diabetes_dataset %>%
  select(age, heart_disease, blood_glucose_level) %>%
  filter(heart_disease == 1)
hd_count <- heart_disease_only %>% tally()
hd_count

# Patients aged 2+ with both conditions and a recorded blood glucose level.
all_in_one <- diabetes_dataset %>%
  select(age, diabetes, blood_glucose_level, heart_disease) %>%
  filter(
    diabetes == 1,
    !is.na(blood_glucose_level),
    age >= 2,
    heart_disease == 1
  )
all_in_one

# Overlaid blood-glucose densities. `fill` is mapped inside aes() to a group
# label so ggplot draws a legend identifying each curve.
ggplot() +
  geom_density(
    data = diabetes_only,
    aes(x = blood_glucose_level, fill = "Diabetes"),
    alpha = 0.5
  ) +
  geom_density(
    data = blood_glucose_dataset,
    aes(x = blood_glucose_level, fill = "All patients (age 2+)"),
    alpha = 0.5
  ) +
  geom_density(
    data = heart_disease_only,
    aes(x = blood_glucose_level, fill = "Heart disease"),
    alpha = 0.5
  ) +
  scale_fill_manual(
    name = "Group",
    values = c(
      "Diabetes" = "blue",
      "All patients (age 2+)" = "red",
      "Heart disease" = "magenta"
    ),
    breaks = c("Diabetes", "All patients (age 2+)", "Heart disease")
  ) +
  labs(
    title = "Blood Glucose Distribution: Diabetes vs. Heart Disease",
    x = "Blood Glucose Level",
    y = "Density"
  ) +
  theme_minimal()
library(ggplot2)
library(dplyr)
library(DT)

# Diabetes cases with a recorded blood glucose level.
diabetes_only <- diabetes_dataset %>%
  select(age, diabetes, blood_glucose_level) %>%
  filter(diabetes == 1, !is.na(blood_glucose_level))

# Everyone aged 2+ with a recorded blood glucose level.
blood_glucose_dataset <- diabetes_dataset %>%
  select(age, blood_glucose_level) %>%
  filter(!is.na(blood_glucose_level), age >= 2)

# Heart-disease cases with a recorded blood glucose level.
heart_disease_only <- diabetes_dataset %>%
  select(age, heart_disease, blood_glucose_level) %>%
  filter(heart_disease == 1, !is.na(blood_glucose_level))

# Overlaid blood-glucose densities. The first layer plots `diabetes_only`
# (the subset prepared above for this comparison), and `fill` is mapped
# inside aes() to a group label so ggplot draws a legend for the curves.
ggplot() +
  geom_density(
    data = diabetes_only,
    aes(x = blood_glucose_level, fill = "Diabetes"),
    alpha = 0.5
  ) +
  geom_density(
    data = blood_glucose_dataset,
    aes(x = blood_glucose_level, fill = "All patients (age 2+)"),
    alpha = 0.5
  ) +
  geom_density(
    data = heart_disease_only,
    aes(x = blood_glucose_level, fill = "Heart disease"),
    alpha = 0.5
  ) +
  scale_fill_manual(
    name = "Group",
    values = c(
      "Diabetes" = "blue",
      "All patients (age 2+)" = "red",
      "Heart disease" = "magenta"
    ),
    breaks = c("Diabetes", "All patients (age 2+)", "Heart disease")
  ) +
  labs(
    title = "Blood Glucose Density: Diabetes vs. Heart Disease",
    x = "Blood Glucose Level",
    y = "Density"
  ) +
  theme_minimal()
diabetes_dataset
## # A tibble: 100,000 × 9
## gender age hypertension heart_disease smoking_history bmi HbA1c_level
## <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 Female 80 0 1 never 25.2 6.6
## 2 Female 54 0 0 No Info 27.3 6.6
## 3 Male 28 0 0 never 27.3 5.7
## 4 Female 36 0 0 current 23.4 5
## 5 Male 76 1 1 current 20.1 4.8
## 6 Female 20 0 0 never 27.3 6.6
## 7 Female 44 0 0 never 19.3 6.5
## 8 Female 79 0 0 No Info 23.9 5.7
## 9 Male 42 0 0 never 33.6 4.8
## 10 Female 32 0 0 never 27.3 5
## # ℹ 99,990 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
# Subsets of patients aged 2+ used for the overlaid age densities.
diabetes_only <- diabetes_dataset %>%
  select(age, diabetes) %>%
  filter(age >= 2, diabetes == 1)
heart_disease_only <- diabetes_dataset %>%
  select(age, heart_disease) %>%
  filter(age >= 2, heart_disease == 1)
# Age and BMI for everyone aged 2+; its plot layer is commented out below.
bmi_older_than_two <- diabetes_dataset %>%
  select(age, bmi) %>%
  filter(age >= 2)

# `fill` is mapped inside aes() to a group label so ggplot draws a legend;
# with constant fills the two curves could not be told apart.
ggplot() +
  geom_density(
    data = diabetes_only,
    aes(x = age, fill = "Diabetes"),
    alpha = 0.5
  ) +
  geom_density(
    data = heart_disease_only,
    aes(x = age, fill = "Heart disease"),
    alpha = 0.5
  ) +
  # geom_density(data = bmi_older_than_two, aes(x = age), fill = "magenta", alpha = 0.5) +
  scale_fill_manual(
    name = "Group",
    values = c("Diabetes" = "blue", "Heart disease" = "red")
  ) +
  labs(
    title = "Age Distribution: Diabetes vs. Heart Disease",
    x = "Age",
    y = "Density"
  ) +
  theme_minimal()
diabetes_dataset
## # A tibble: 100,000 × 9
## gender age hypertension heart_disease smoking_history bmi HbA1c_level
## <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 Female 80 0 1 never 25.2 6.6
## 2 Female 54 0 0 No Info 27.3 6.6
## 3 Male 28 0 0 never 27.3 5.7
## 4 Female 36 0 0 current 23.4 5
## 5 Male 76 1 1 current 20.1 4.8
## 6 Female 20 0 0 never 27.3 6.6
## 7 Female 44 0 0 never 19.3 6.5
## 8 Female 79 0 0 No Info 23.9 5.7
## 9 Male 42 0 0 never 33.6 4.8
## 10 Female 32 0 0 never 27.3 5
## # ℹ 99,990 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
# Subsets of patients used for the overlaid age densities.
diabetes_only <- diabetes_dataset %>%
  select(age, diabetes) %>%
  filter(age >= 2, diabetes == 1)
heart_disease_only <- diabetes_dataset %>%
  select(age, heart_disease) %>%
  filter(age >= 2, heart_disease == 1)
# Former smokers aged 7+; prepared here but not drawn in the plot below.
former_smoker_only <- diabetes_dataset %>%
  select(age, smoking_history) %>%
  filter(age >= 7, smoking_history == "former")

# `fill` is mapped inside aes() to a group label so ggplot draws a legend;
# with constant fills the two curves could not be told apart.
ggplot() +
  geom_density(
    data = diabetes_only,
    aes(x = age, fill = "Diabetes"),
    alpha = 0.5
  ) +
  geom_density(
    data = heart_disease_only,
    aes(x = age, fill = "Heart disease"),
    alpha = 0.5
  ) +
  scale_fill_manual(
    name = "Group",
    values = c("Diabetes" = "blue", "Heart disease" = "red")
  ) +
  labs(
    title = "Age Distribution: Diabetes vs. Heart Disease",
    x = "Age",
    y = "Density"
  ) +
  theme_minimal()
# Former smokers of any age (youngest: 7, oldest: 80).
former_smoker_only <- diabetes_dataset %>%
  select(age, smoking_history) %>%
  filter(smoking_history == "former")
##################!!!!!!!!!!!!!!!!!!!!
#diabetes_dataset %>% select(age, diabetes) %>% filter(diabetes == 1) %>% arrange(age)
# y: 3 o: 80
#diabetes_dataset %>% select(age,heart_disease) %>% filter(heart_disease == 1) %>% arrange(age)
# y: 2 o: 80
library(ggplot2)
library(dplyr)
#install.packages("ggthemes")
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.4.3
# Subsets used for the overlaid age densities; each lower age bound is the
# youngest age noted for that group (diabetes: 3, heart disease: 2,
# former smokers: 7).
diabetes_only <- diabetes_dataset %>%
  select(age, diabetes) %>%
  filter(age >= 3, diabetes == 1)
heart_disease_only <- diabetes_dataset %>%
  select(age, heart_disease) %>%
  filter(age >= 2, heart_disease == 1)
former_smoker_only <- diabetes_dataset %>%
  select(age, smoking_history) %>%
  filter(age >= 7, smoking_history == "former")

# `fill` is mapped inside aes() to a group label so ggplot draws a legend
# identifying the three curves.
ggplot() +
  theme_stata() +
  geom_density(
    data = diabetes_only,
    aes(x = age, fill = "Diabetes"),
    alpha = 0.5
  ) +
  geom_density(
    data = heart_disease_only,
    aes(x = age, fill = "Heart disease"),
    alpha = 0.5
  ) +
  geom_density(
    data = former_smoker_only,
    aes(x = age, fill = "Former smokers"),
    alpha = 0.5
  ) +
  # Every layer, including the last geom, must end with `+`: without it
  # labs() is evaluated as a separate expression, so its labels are printed
  # to the console instead of being applied to the plot.
  scale_fill_manual(
    name = "Group",
    values = c(
      "Diabetes" = "blue",
      "Heart disease" = "red",
      "Former smokers" = "cyan"
    ),
    breaks = c("Diabetes", "Heart disease", "Former smokers")
  ) +
  labs(
    title = "Age Distribution: Diabetes vs. Heart Disease vs. Former Smokers",
    x = "Age",
    y = "Density"
  )
## $x
## [1] "Age"
##
## $y
## [1] "Density"
##
## $title
## [1] "Age Distribution: Diabetes vs. Heart Disease"
##
## attr(,"class")
## [1] "labels"
```